library(ggplot2)
library(GenomicRanges)
library(rtracklayer)
library(edgeR)
library(gridExtra)
library(data.table)
library(dplyr)
library(stringr)
library(matrixStats)
library(goseq)
library(GO.db)


options(stringsAsFactors=FALSE)
setwd("/Volumes/griw/Cancer-Epigenetics/Projects/LNCaP_VCaP_DHT_Hi-C/Elyssa_2021_analysis_and_integration/RNA-Seq")
load("/Volumes/griw/Cancer-Epigenetics/Projects/LNCaP_VCaP_DHT_Hi-C/Elyssa_2021_analysis_and_integration/RNA-Seq/LNCaP_DHT_edgeR_LTR_GLM_31082020.RData")

Initialization

SAMPLES.NAME <- "LNCaP_DHT"
OUTPUT.FOLDER <- "DGE_"
TABLES.FOLDER <- "tables/"

raw.read.counts <- read.csv(paste0(TABLES.FOLDER, SAMPLES.NAME, "_RUVr_norm_count_table.csv"), row.names=1)
tpm.data <- read.csv(paste0(TABLES.FOLDER, SAMPLES.NAME, "_TPM_table.csv"), row.names=1)

# round the rsem gene expected counts values to the nearest integer to input into edgeR
raw.read.counts <- round(raw.read.counts)
#counts<-counts[, c(2,3,4,5,6,7,8,9,10,11,12,13,14,15,16)]
# matrix of counts with ENSGxxxxxxxx tags

counts <- data.matrix(raw.read.counts) 
colnames(counts)<-colnames(raw.read.counts)
counts<-counts[, c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15)]

colnames(tpm.data) <- colnames(raw.read.counts)
tpm.data<-tpm.data[, c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15)]
tpm.data.matrix <- data.matrix(tpm.data)

#sample.names <- colnames(raw.read.counts)
sample.names <- colnames(raw.read.counts)
# range of library size/sequencing depth
library.size <- round(colSums(counts)/1e6, 1)
library.size
##  LNCaP_EtOH_16h_Rep1  LNCaP_EtOH_16h_Rep2  LNCaP_EtOH_16h_Rep3 
##                 31.0                 42.7                 35.6 
## LNCaP_DHT_30min_Rep1 LNCaP_DHT_30min_Rep2 LNCaP_DHT_30min_Rep3 
##                 27.8                 33.4                 37.4 
##    LNCaP_DHT_2h_Rep1    LNCaP_DHT_2h_Rep2    LNCaP_DHT_2h_Rep3 
##                 32.8                 31.9                 34.5 
##    LNCaP_DHT_4h_Rep1    LNCaP_DHT_4h_Rep2    LNCaP_DHT_4h_Rep3 
##                 35.5                 32.0                 35.2 
##   LNCaP_DHT_16h_Rep1   LNCaP_DHT_16h_Rep2   LNCaP_DHT_16h_Rep3 
##                 34.5                 37.5                 34.2

PCA plots

Log transformed count values

# perform PCA on log transformed count values
pca <- prcomp(t(log(counts+1)), center=TRUE)
# also see summary(pca)
pc1.var <- round(pca$sdev[1]^2/sum(pca$sdev^2)*100,2) 
pc2.var <- round(pca$sdev[2]^2/sum(pca$sdev^2)*100,2)
pc.data.frame <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2], Name=colnames(counts), stringsAsFactors=F)

makeLab <- function(x,pc) paste0("PC",pc,": ", x, "% variance")

ggplot(pc.data.frame, aes(x=PC1, y=PC2, label=Name))+
  geom_point() + geom_text(aes(colour = factor(gsub("_Rep1|_Rep2|_Rep3", "", pc.data.frame$Name)))) +
  xlab(makeLab(pc1.var,1)) + ylab(makeLab(pc2.var,2)) +
  ggtitle("PC1 vs PC2 for log(count) of LNCaP DHT") +
  expand_limits(x=c(-100, 100)) +
  theme(legend.position = "none")
## Warning: Use of `pc.data.frame$Name` is discouraged. Use `Name` instead.
PCA plot of log transformed count values.

PCA plot of log transformed count values.

Log transformed TPM values

# perform PCA on TPM values (use log(TPM) as TPM can be quite swayed by 
# differences in sample library size)
pca <- prcomp(t(log(tpm.data.matrix+1)), center=TRUE)
# also see summary(pca)
pc1.var <- round(pca$sdev[1]^2/sum(pca$sdev^2)*100,2) 
pc2.var <- round(pca$sdev[2]^2/sum(pca$sdev^2)*100,2)
pc.data.frame <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2], Name=colnames(tpm.data.matrix))

ggplot(pc.data.frame, aes(x=PC1, y=PC2, label=Name))+
  geom_point() + geom_text(aes(colour=factor(gsub("_Rep1|_Rep2|_Rep3", "", pc.data.frame$Name)))) +
  xlab(makeLab(pc1.var,1)) + ylab(makeLab(pc2.var,2)) +
  ggtitle("PC1 vs PC2 for log(TPM) values of LNCaP DHT") +
  expand_limits(x=c(-50,50)) +
  theme(legend.position = "none")
## Warning: Use of `pc.data.frame$Name` is discouraged. Use `Name` instead.
PCA plot of log transformed TPM values.

PCA plot of log transformed TPM values.


Hierarchical clustering

log(count) values

# hierarchical clustering for log(count) values
hc <- hclust(dist(t(log(counts+1))))       
plot(hc, labels=colnames(counts), main="Clustering samples on log(counts+1)")
Heirachical clustering of log transformed count values.

Heirachical clustering of log transformed count values.

log(TPM) values

# hierarchical clustering for log(TPM) vlaues
hc <- hclust(dist(t(log(tpm.data.matrix+1))))       
plot(hc, labels=colnames(tpm.data.matrix), main="Clustering samples on log(TPM+1)")
Heirachical clustering of log transformed TPM values.

Heirachical clustering of log transformed TPM values.


Differential Gene Expression Analysis

Design Matrix and Count Data

# Filter out ENSGxxxx tags whose coverage is so low that any group differences
# aren't truly "real". 
# filter out tags whose rowcount <= degrees of freedom.
counts <- counts[rowSums(counts) >= 3,]
tpm.data.matrix <- tpm.data.matrix[rowSums(tpm.data.matrix) >= 3, ]

# set up design matrix
group.types <- gsub("_Rep1|_Rep2|_Rep3", "", colnames(counts))
group <- factor(group.types)
design <- model.matrix(~0+group) 
colnames(design) <- gsub("group", "", colnames(design))
design
##    LNCaP_DHT_16h LNCaP_DHT_2h LNCaP_DHT_30min LNCaP_DHT_4h LNCaP_EtOH_16h
## 1              0            0               0            0              1
## 2              0            0               0            0              1
## 3              0            0               0            0              1
## 4              0            0               1            0              0
## 5              0            0               1            0              0
## 6              0            0               1            0              0
## 7              0            1               0            0              0
## 8              0            1               0            0              0
## 9              0            1               0            0              0
## 10             0            0               0            1              0
## 11             0            0               0            1              0
## 12             0            0               0            1              0
## 13             1            0               0            0              0
## 14             1            0               0            0              0
## 15             1            0               0            0              0
## attr(,"assign")
## [1] 1 1 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"
dge_gene <- DGEList(counts=counts, group=group)
dge_gene$samples
##                                group lib.size norm.factors
## LNCaP_EtOH_16h_Rep1   LNCaP_EtOH_16h 30973551            1
## LNCaP_EtOH_16h_Rep2   LNCaP_EtOH_16h 42737055            1
## LNCaP_EtOH_16h_Rep3   LNCaP_EtOH_16h 35570565            1
## LNCaP_DHT_30min_Rep1 LNCaP_DHT_30min 27807919            1
## LNCaP_DHT_30min_Rep2 LNCaP_DHT_30min 33354952            1
## LNCaP_DHT_30min_Rep3 LNCaP_DHT_30min 37439672            1
## LNCaP_DHT_2h_Rep1       LNCaP_DHT_2h 32758444            1
## LNCaP_DHT_2h_Rep2       LNCaP_DHT_2h 31891935            1
## LNCaP_DHT_2h_Rep3       LNCaP_DHT_2h 34507799            1
## LNCaP_DHT_4h_Rep1       LNCaP_DHT_4h 35471560            1
## LNCaP_DHT_4h_Rep2       LNCaP_DHT_4h 32049868            1
## LNCaP_DHT_4h_Rep3       LNCaP_DHT_4h 35152750            1
## LNCaP_DHT_16h_Rep1     LNCaP_DHT_16h 34467043            1
## LNCaP_DHT_16h_Rep2     LNCaP_DHT_16h 37460339            1
## LNCaP_DHT_16h_Rep3     LNCaP_DHT_16h 34204887            1
###Filter out lowly expressed genes
cpm.cutoff <- 10/(min(dge_gene$samples$lib.size)/1000000)
cpm.cutoff
## [1] 0.3596098
#0.3596098
##Use 0.6 for simplicity
keep <- rowSums(cpm(dge_gene) > 0.6) >= 3 # 3 is used here as each group as 3 replicates
table(keep)
## keep
## FALSE  TRUE 
##  4407 16051

TMM Normalization

# TMM normalization
dge_gene_norm1 <- calcNormFactors(dge_gene, method="TMM")
dge_gene_norm1$samples
##                                group lib.size norm.factors
## LNCaP_EtOH_16h_Rep1   LNCaP_EtOH_16h 30973551    0.9824746
## LNCaP_EtOH_16h_Rep2   LNCaP_EtOH_16h 42737055    1.0345906
## LNCaP_EtOH_16h_Rep3   LNCaP_EtOH_16h 35570565    0.9869952
## LNCaP_DHT_30min_Rep1 LNCaP_DHT_30min 27807919    1.0265634
## LNCaP_DHT_30min_Rep2 LNCaP_DHT_30min 33354952    1.0051532
## LNCaP_DHT_30min_Rep3 LNCaP_DHT_30min 37439672    0.9843880
## LNCaP_DHT_2h_Rep1       LNCaP_DHT_2h 32758444    0.9959084
## LNCaP_DHT_2h_Rep2       LNCaP_DHT_2h 31891935    1.0038058
## LNCaP_DHT_2h_Rep3       LNCaP_DHT_2h 34507799    1.0159487
## LNCaP_DHT_4h_Rep1       LNCaP_DHT_4h 35471560    1.0037345
## LNCaP_DHT_4h_Rep2       LNCaP_DHT_4h 32049868    0.9989522
## LNCaP_DHT_4h_Rep3       LNCaP_DHT_4h 35152750    0.9945556
## LNCaP_DHT_16h_Rep1     LNCaP_DHT_16h 34467043    1.0009730
## LNCaP_DHT_16h_Rep2     LNCaP_DHT_16h 37460339    0.9947044
## LNCaP_DHT_16h_Rep3     LNCaP_DHT_16h 34204887    0.9731077
# perform PCA on log(CPMS)
cpms <- cpm(dge_gene_norm1)
log.cpms <- log(cpms + 1)
# perform PCA on CPM values (use log(CPM) as CPM can be quite affected by differences in 
# sample library size)
pca <- prcomp(t(log.cpms), center=TRUE)
# also see summary(pca)
pc1.var <- round(pca$sdev[1]^2/sum(pca$sdev^2)*100,2) 
pc2.var <- round(pca$sdev[2]^2/sum(pca$sdev^2)*100,2)
pc.data.frame <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2], Name=colnames(cpms))
ggplot(pc.data.frame, aes(x=PC1, y=PC2, label=Name))+
  geom_point() + geom_text(aes(colour = factor(gsub("_Rep1|_Rep2|_Rep3", "", pc.data.frame$Name)))) +
  xlab(makeLab(pc1.var,1)) + ylab(makeLab(pc2.var,2)) +
  ggtitle("PC1 vs PC2 for log(CPMS) of LNCaP DHT") +
  expand_limits(x=c(-100, 100)) +
  theme(legend.position = "none")
## Warning: Use of `pc.data.frame$Name` is discouraged. Use `Name` instead.
PCA plot of log transformed CPM values following TMM normalisation.

PCA plot of log transformed CPM values following TMM normalisation.

GLM estimates of variance (dispersion)

# Dispersion
dge_gene_norm1 <- estimateDisp(dge_gene_norm1, design)
dge_gene_norm1$common.dispersion
## [1] 0.005301402
# Fitting a model in edgeR takes several steps. 
# First, you must fit the common dispersion. Then you need to fit a trended model 
# (if you do not fit a trend, the default is to use the common dispersion as a trend). 
# Then you can fit the tagwise dispersion which is a function of this model.
dge_gene_norm1 <- estimateGLMCommonDisp(dge_gene_norm1, design, verbose=TRUE)
## Disp = 0.00514 , BCV = 0.0717
#Disp = 0.00514 , BCV = 0.0717 
dge_gene_norm1 <- estimateGLMTrendedDisp(dge_gene_norm1, design)
dge_gene_norm1 <- estimateGLMTagwiseDisp(dge_gene_norm1, design)

Genewise biological coefficient of variation (BCV)

# plot the genewise biological coefficient of variation (BCV) against gene abundance 
# (in log2 counts per million). 
# It displays the common, trended and tagwise BCV estimates.
plotBCV(dge_gene_norm1)
Plot of the genewise biological coefficient of variation (BCV) against gene abundance (log2 CPM).

Plot of the genewise biological coefficient of variation (BCV) against gene abundance (log2 CPM).

Fit linear model and make contrasts

# Fitting linear model
fit <- glmFit(dge_gene_norm1, design)

# find the tags that are interesting by using a LRT (Likelihood Ratio Test)
# alternative to use ? makeContrasts (limma)
# absolute differences between various pairs of conditions

##EtOH is -1
lrt.30mins.vs.EtOH  <- glmLRT(fit, contrast=c(0, 0, 1, 0, -1))
lrt.2hrs.vs.EtOH  <- glmLRT(fit, contrast=c(0, 1, 0, 0, -1))
lrt.4hrs.vs.EtOH <- glmLRT(fit, contrast=c(0, 0, 0, 1, -1))
lrt.16hrs.vs.EtOH  <- glmLRT(fit, contrast=c(1, 0, 0, 0, -1))

##GLM
fit_glm <- glmQLFit(dge_gene_norm1, design)

glm.30mins.vs.EtOH <- glmQLFTest(fit_glm, contrast=c(0, 0, 1, 0, -1))
glm.2hrs.vs.EtOH <- glmQLFTest(fit_glm, contrast=c(0, 1, 0, 0, -1))
glm.4hrs.vs.EtOH <- glmQLFTest(fit_glm, contrast=c(0, 0, 0, 1, -1))
glm.16hrs.vs.EtOH <- glmQLFTest(fit_glm, contrast=c(1, 0, 0, 0, -1))

# add all contrasts to a list for subsequent processing
lrt.list <- list(lrt.30mins.vs.EtOH=lrt.30mins.vs.EtOH, 
                 lrt.2hrs.vs.EtOH=lrt.2hrs.vs.EtOH,
                 lrt.4hrs.vs.EtOH=lrt.4hrs.vs.EtOH,
                 lrt.16hrs.vs.EtOH=lrt.16hrs.vs.EtOH)

glm.list <- list(glm.30mins.vs.EtOH=glm.30mins.vs.EtOH, 
                 glm.2hrs.vs.EtOH=glm.2hrs.vs.EtOH,
                 glm.4hrs.vs.EtOH=glm.4hrs.vs.EtOH,
                 glm.16hrs.vs.EtOH=glm.16hrs.vs.EtOH)

Get annotated top tag results

Likelihood Ratio Test Results

# annotation
# FROM GTF FILE
setwd("/Volumes/griw/Cancer-Epigenetics/Projects/LNCaP_VCaP_DHT_Hi-C/Elyssa_2021_analysis_and_integration/RNA-Seq")
# FROM GTF FILE
gtf.annotation.file <- "data/gene_annotation.tsv"
gtf.anno <- read.table(gtf.annotation.file, sep= "\t", header=TRUE, stringsAsFactors=F)

colnames(gtf.anno)
## [1] "gene.id"   "gene.name" "chr"       "start"     "end"       "strand"   
## [7] "gene.type"
# annotation file downloaded from http://www.ensembl.org/biomart/martview/ (July2017)
# has following columns: ensembl.gene.ID, chr, start, end, strand, 
# description, HGNC.symbol, entrez.gene.ID
annotation.file <- "data/gene_annotation_mart_export.tsv"
if (!(file.exists(annotation.file))){
  untar(paste0("input/", annotation.file, ".tgz"))
}

DT <- fread(annotation.file)
setnames(DT, gsub(" ", ".", colnames(DT)))
setkey(DT, HGNC.symbol)
head(DT)
##     Gene.stable.ID Chromosome/scaffold.name Gene.start.(bp) Gene.end.(bp)
## 1: ENSG00000279919                        8        19277284      19277573
## 2: ENSG00000283061                        7           12704         27234
## 3: ENSG00000283061                        7           12704         27234
## 4: ENSG00000282572                        7           19018         35489
## 5: ENSG00000282572                        7           19018         35489
## 6: ENSG00000237445                        1        13657311      13659910
##    Strand HGNC.symbol EntrezGene.transcript.name.ID
## 1:      1                                          
## 2:      1                                          
## 3:      1                                          
## 4:     -1                                          
## 5:     -1                                          
## 6:     -1
# implement once here for use with repeated GOseq analysis within lapply
# GOseq
lengths.gene <- read.csv(paste0("tables/LNCaP_DHT_effective_length_table.csv"), row.names=1)
# get the average gene length for each row
lengths <- apply(lengths.gene, 1, mean)

getgoresults <- function(DMgenes, bias.data, output.folder){
  
  # fitting the Probability Weighting Function
  # PWF quantifies how the probability of a gene selected as DE changes as 
  # a function of its transcript length
  pwf <- nullp(DEgenes=DMgenes, genome="hg38", id="geneSymbol", 
               bias.data=bias.data, plot.fit=FALSE)
  pdf(paste0(output.folder, "pwf.goodness.of.fit.plot.pdf"))
  plotPWF(pwf)
  dev.off()
  # calculate  the  over  and  under  expressed  GO
  # categories among DE genes
  # goseqres ordered by GO category over representation amongst DE genes.
  goseqres <- goseq(pwf, "hg38", "geneSymbol")
  # multiple correction
  goseqres$over_fdr <- p.adjust(goseqres$over_represented_pvalue, method="BH")
  goseqres$under_fdr <- p.adjust(goseqres$under_represented_pvalue, method="BH")
  
  over <- goseqres[order(goseqres$over_fdr),]
  
  go <- getgo(names(DMgenes),"hg38","geneSymbol")
  go <- go[DMgenes==1]
  go <- go[!is.na(names(go))]
  # create array of gene name-GO term
  gotable <- array(0, c(0, 2))
  for (i in 1:length(go)){
    gotable <- rbind(gotable, cbind(rep(names(go)[[i]], length(go[[i]])), unlist(go[[i]])))
  }
  # split by go term into genes
  gomap <- split(gotable[,1], gotable[,2])
  # add to goseq results table
  m <- match(goseqres$category, names(gomap))
  over$genes <- sapply(m, function(x) paste(unlist(gomap[x]), collapse=', '))
  over
}

FDR.CUTOFF <- 0.05
LOG.FC.CUTOFF <- 1

res <- lapply(names(lrt.list), function(contrast){
  
  print (contrast)
  comparison <- gsub("lrt.", "", contrast)
  
  partA <- strsplit(comparison, ".vs.")[[1]][1]
  partB <- strsplit(comparison, ".vs.")[[1]][2]
  
  SUB.FOLDER <- paste0(OUTPUT.FOLDER, gsub("lrt.", "", contrast), "/")
  
  if (!(file.exists(SUB.FOLDER))){
    dir.create(SUB.FOLDER)
  }
  # Top table
  # the default method used to adjust p-values for multiple testing is BH.
  tt <- topTags(lrt.list[[contrast]], n=nrow(counts))$table
  
  m <- match(rownames(tt), gtf.anno$gene.id)
  
  # assign the rownames(tt) as the gene_id as more specific with version 
  # number at end as originates from original gtf
  tt$gene.id <- rownames(tt)
  tt$gene.symbol <- gtf.anno$gene.name[m]
  tt$chr <- gtf.anno$chr[m]
  tt$start <- gtf.anno$start[m]
  tt$end <- gtf.anno$end[m]
  tt$strand <- gtf.anno$strand[m]
  tt$gene.type <- gtf.anno$gene.type[m]
  
  m <- match(gsub("\\.[0-9]*", "", rownames(tt)), DT$Gene.stable.ID)
  
  tt$description <- DT$Gene.description[m]
  tt$entrez.gene.id <- DT$Gene.name[m]
  
  # only keep chromosome names beginning with chr1..22, X, Y; 
  # remove patch chromosome assignments 
  # like JH806587.1, JH806587.1 etc
  tt <- tt[grep("chr*", tt$chr),]
  
  #Volcano plot
  #  plot(tt$logFC, -log10(tt$PValue), type="n", xlab="BRG1 KD <- -> scrambled logFC", ylab="-log10(p.value)", main="Volcano plot of Scrambled vs BRG1 KD")
  #plot(tt$logFC, -log10(tt$PValue), type="n", xlab=paste0(partB, " <- -> ", partA, " logFC"), ylab="-log10(p.value)", main=paste0("Volcano plot of ", partA, " vs ", partB))
  #text(tt$logFC, -log10(tt$PValue), labels = tt$gene.symbol, cex=0.5)
  #abline(h=-log10(tt$PValue[sum(tt$FDR < 0.05)]), col="red")
  
  vp.data <- tt[c("gene.symbol", "logFC", "PValue", "FDR")]
  vp.data = mutate(vp.data, sig=ifelse(((vp.data$FDR<FDR.CUTOFF)&(abs(vp.data$logFC)>LOG.FC.CUTOFF)),
                                       paste0("FDR<", FDR.CUTOFF), "Not Sig"))
  
  p <- ggplot(vp.data, aes(logFC, -log10(PValue))) +
    geom_point(aes(col=sig)) +
    scale_color_manual(values=c("red", "black")) +
    geom_text(data=filter(vp.data, ((FDR<FDR.CUTOFF)&(abs(logFC)>LOG.FC.CUTOFF))), aes(label=gene.symbol)) +
    ggtitle(paste0(comparison, " volcano plot")) +
    # edit the x label to signify contrast direction
    xlab(paste0(partB, " <- ", "logFC", " -> ", partA))
  
  print (p)
  
  # The function plotSmear generates a plot of the tagwise log-fold-changes against 
  # log-cpm (analogous to an MA-plot for microarray data). 
  # DE tags are highlighted on the plot
  de2 <- decideTestsDGE(lrt.list[[contrast]], p.value=FDR.CUTOFF, lfc=LOG.FC.CUTOFF)
  de2tags <- rownames(lrt.list[[contrast]])[as.logical(de2)]
  plotSmear(lrt.list[[contrast]], de.tags=de2tags, 
            main=paste0("smear plot with p.value < ", FDR.CUTOFF, " and LFC=", 
                        LOG.FC.CUTOFF, " cutoffs"))
  abline(h = c(-2, 2), col = "blue")
  
  # defining significant as FDR < FDR.CUTOFF and abs(logFC) > LOG.FC.CUTOFF 
  # add DGE.status column
  # UP, DOWN, NC
  tt$DGE.status <- "NC"
  # defining significant as FDR < FDR.CUTOFF and abs(logFC) > LOG.FC.CUTOFF 
  # for UP/DOWN
  if (nrow(tt[((tt$FDR < FDR.CUTOFF)&(tt$logFC > LOG.FC.CUTOFF)),]) > 0){
    tt[((tt$FDR < FDR.CUTOFF)&(tt$logFC > LOG.FC.CUTOFF)),]$DGE.status <- "UP"
  }
  if (nrow(tt[((tt$FDR < FDR.CUTOFF)&(tt$logFC < -LOG.FC.CUTOFF)),]) > 0){
    tt[((tt$FDR < FDR.CUTOFF)&(tt$logFC < -LOG.FC.CUTOFF)),]$DGE.status <- "DOWN"
  }
  # filtering/re-ordering
  column.order <- c(6:13, 1:5)
  tt <- tt[,column.order]
  tt[is.na(tt)] <- ""
  
  #write.table(tt, paste0(SUB.FOLDER, comparison, ".DGE.tsv"), sep="\t", quote=F, row.names=F)
  
  print (nrow(tt[((tt$FDR < FDR.CUTOFF)&(abs(tt$logFC) > LOG.FC.CUTOFF)),]))
  print (paste0("UP: ", nrow(tt[(tt$DGE.status=="UP"),])))
  print (paste0("DOWN: ", nrow(tt[(tt$DGE.status=="DOWN"),])))
  
  # defining significant as FDR < FDR.CUTOFF and abs(logFC) > LOG.FC.CUTOFF
  sigtt <- tt[((tt$FDR < FDR.CUTOFF)&(abs(tt$logFC) > LOG.FC.CUTOFF)),]
  
})
## [1] "lrt.30mins.vs.EtOH"
Plot of the LRT tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

Plot of the LRT tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

Plot of the LRT tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

Plot of the LRT tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

## [1] 187
## [1] "UP: 113"
## [1] "DOWN: 74"
## [1] "lrt.2hrs.vs.EtOH"
Plot of the LRT tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

Plot of the LRT tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

Plot of the LRT tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

Plot of the LRT tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

## [1] 928
## [1] "UP: 714"
## [1] "DOWN: 214"
## [1] "lrt.4hrs.vs.EtOH"
Plot of the LRT tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

Plot of the LRT tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

Plot of the LRT tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

Plot of the LRT tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

## [1] 774
## [1] "UP: 500"
## [1] "DOWN: 274"
## [1] "lrt.16hrs.vs.EtOH"
Plot of the LRT tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

Plot of the LRT tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

Plot of the LRT tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

Plot of the LRT tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

## [1] 930
## [1] "UP: 597"
## [1] "DOWN: 333"

GLM Results

##GLM
res <- lapply(names(glm.list), function(contrast){
  
  print (contrast)
  comparison <- gsub("glm.", "", contrast)
  
  partA <- strsplit(comparison, ".vs.")[[1]][1]
  partB <- strsplit(comparison, ".vs.")[[1]][2]
  
  SUB.FOLDER <- paste0(OUTPUT.FOLDER, gsub("glm.", "", contrast), "/")
  
  if (!(file.exists(SUB.FOLDER))){
    dir.create(SUB.FOLDER)
  }
  # Top table
  # the default method used to adjust p-values for multiple testing is BH.
  tt <- topTags(glm.list[[contrast]], n=nrow(counts))$table
  
  m <- match(rownames(tt), gtf.anno$gene.id)
  
  # assign the rownames(tt) as the gene_id as more specific with version 
  # number at end as originates from original gtf
  tt$gene.id <- rownames(tt)
  tt$gene.symbol <- gtf.anno$gene.name[m]
  tt$chr <- gtf.anno$chr[m]
  tt$start <- gtf.anno$start[m]
  tt$end <- gtf.anno$end[m]
  tt$strand <- gtf.anno$strand[m]
  tt$gene.type <- gtf.anno$gene.type[m]
  
  m <- match(gsub("\\.[0-9]*", "", rownames(tt)), DT$Gene.stable.ID)
  
  tt$description <- DT$Gene.description[m]
  tt$entrez.gene.id <- DT$Gene.name[m]
  
  # only keep chromosome names beginning with chr1..22, X, Y; 
  # remove patch chromosome assignments 
  # like JH806587.1, JH806587.1 etc
  tt <- tt[grep("chr*", tt$chr),]
  
  #Volcano plot
  #  plot(tt$logFC, -log10(tt$PValue), type="n", xlab="BRG1 KD <- -> scrambled logFC", ylab="-log10(p.value)", main="Volcano plot of Scrambled vs BRG1 KD")
  #plot(tt$logFC, -log10(tt$PValue), type="n", xlab=paste0(partB, " <- -> ", partA, " logFC"), ylab="-log10(p.value)", main=paste0("Volcano plot of ", partA, " vs ", partB))
  #text(tt$logFC, -log10(tt$PValue), labels = tt$gene.symbol, cex=0.5)
  #abline(h=-log10(tt$PValue[sum(tt$FDR < 0.05)]), col="red")
  
  vp.data <- tt[c("gene.symbol", "logFC", "PValue", "FDR")]
  vp.data = mutate(vp.data, sig=ifelse(((vp.data$FDR<FDR.CUTOFF)&(abs(vp.data$logFC)>LOG.FC.CUTOFF)),
                                       paste0("FDR<", FDR.CUTOFF), "Not Sig"))
  
  p <- ggplot(vp.data, aes(logFC, -log10(PValue))) +
    geom_point(aes(col=sig)) +
    scale_color_manual(values=c("red", "black")) +
    geom_text(data=filter(vp.data, ((FDR<FDR.CUTOFF)&(abs(logFC)>LOG.FC.CUTOFF))), aes(label=gene.symbol)) +
    ggtitle(paste0(comparison, " volcano plot")) +
    # edit the x label to signify contrast direction
    xlab(paste0(partB, " <- ", "logFC", " -> ", partA))
  
  print (p)
  
  # The function plotSmear generates a plot of the tagwise log-fold-changes against 
  # log-cpm (analogous to an MA-plot for microarray data). 
  # DE tags are highlighted on the plot
  de2 <- decideTestsDGE(glm.list[[contrast]], p.value=FDR.CUTOFF, lfc=LOG.FC.CUTOFF)
  de2tags <- rownames(lrt.list[[contrast]])[as.logical(de2)]
  plotSmear(glm.list[[contrast]], de.tags=de2tags, 
            main=paste0("smear plot with GLM p.value < ", FDR.CUTOFF, " and LFC=", 
                        LOG.FC.CUTOFF, " cutoffs"))
  abline(h = c(-2, 2), col = "blue")
  
  # defining significant as FDR < FDR.CUTOFF and abs(logFC) > LOG.FC.CUTOFF 
  # add DGE.status column
  # UP, DOWN, NC
  tt$DGE.status <- "NC"
  # defining significant as FDR < FDR.CUTOFF and abs(logFC) > LOG.FC.CUTOFF 
  # for UP/DOWN
  if (nrow(tt[((tt$FDR < FDR.CUTOFF)&(tt$logFC > LOG.FC.CUTOFF)),]) > 0){
    tt[((tt$FDR < FDR.CUTOFF)&(tt$logFC > LOG.FC.CUTOFF)),]$DGE.status <- "UP"
  }
  if (nrow(tt[((tt$FDR < FDR.CUTOFF)&(tt$logFC < -LOG.FC.CUTOFF)),]) > 0){
    tt[((tt$FDR < FDR.CUTOFF)&(tt$logFC < -LOG.FC.CUTOFF)),]$DGE.status <- "DOWN"
  }
  
  # filtering/re-ordering
  column.order <- c(6:13, 1:5)
  tt <- tt[column.order]
  tt[is.na(tt)] <- ""
  
  #write.table(tt, paste0(SUB.FOLDER, comparison, ".GLM.DGE.tsv"), sep="\t", quote=F, row.names=F)
  
  print (nrow(tt[((tt$FDR < FDR.CUTOFF)&(abs(tt$logFC) > LOG.FC.CUTOFF)),]))
  print (paste0("UP: ", nrow(tt[(tt$DGE.status=="UP"),])))
  print (paste0("DOWN: ", nrow(tt[(tt$DGE.status=="DOWN"),])))
  
  # defining significant as FDR < FDR.CUTOFF and abs(logFC) > LOG.FC.CUTOFF
  sigtt <- tt[((tt$FDR < FDR.CUTOFF)&(abs(tt$logFC) > LOG.FC.CUTOFF)),]
  
})
## [1] "glm.30mins.vs.EtOH"
Plot of the GLM tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

Plot of the GLM tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

Plot of the GLM tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

Plot of the GLM tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

## [1] 122
## [1] "UP: 74"
## [1] "DOWN: 48"
## [1] "glm.2hrs.vs.EtOH"
Plot of the GLM tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

Plot of the GLM tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

Plot of the GLM tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

Plot of the GLM tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

## [1] 793
## [1] "UP: 622"
## [1] "DOWN: 171"
## [1] "glm.4hrs.vs.EtOH"
Plot of the GLM tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

Plot of the GLM tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

Plot of the GLM tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

Plot of the GLM tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

## [1] 659
## [1] "UP: 432"
## [1] "DOWN: 227"
## [1] "glm.16hrs.vs.EtOH"
Plot of the GLM tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

Plot of the GLM tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

Plot of the GLM tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

Plot of the GLM tagwise log-fold-changes against log-cpm (analogous to an MA-plot for microarray data). DE tags are highlighted.

## [1] 806
## [1] "UP: 527"
## [1] "DOWN: 279"

Add annotation to TPM data

# add annotation to TPM data and output
m <- match(rownames(tpm.data), gtf.anno$gene.id)

# assign the rownames(tt) as the gene_id as more specific with version number at end
# as originates from original gtf
tpm.data$gene.id <- rownames(tpm.data)
tpm.data$gene.symbol <- gtf.anno$gene.name[m]
tpm.data$chr <- gtf.anno$chr[m]
tpm.data$start <- gtf.anno$start[m]
tpm.data$end <- gtf.anno$end[m]
tpm.data$strand <- gtf.anno$strand[m]
tpm.data$gene.type <- gtf.anno$gene.type[m]

m <- match(gsub("\\.[0-9]*", "", rownames(tpm.data)), DT$Gene.stable.ID)

tpm.data$description <- DT$description[m]
tpm.data$entrez.gene.id <- DT$entrez.gene.ID[m]

# get averages for each set of duplicates/triplicates
tpm.data$mean_EtOH <- 
  round(rowMeans(tpm.data[,grep("EtOH", colnames(tpm.data))]), 2)
tpm.data$mean_30min <- 
  round(rowMeans(tpm.data[,grep("DHT_30min", colnames(tpm.data))]), 2)
tpm.data$mean_2h <- 
  round(rowMeans(tpm.data[,grep("DHT_2h", colnames(tpm.data))]), 2)
tpm.data$mean_4h <- 
  round(rowMeans(tpm.data[,grep("DHT_4h", colnames(tpm.data))]), 2)
tpm.data$mean_16h <- 
  round(rowMeans(tpm.data[,grep("DHT_16h", colnames(tpm.data))]), 2)


column.order <- c(17:27, 1:16)
tpm.data <- tpm.data[column.order]
tpm.data[is.na(tpm.data)] <- ""

#write.table(tpm.data, paste0("annotated_LNCaP_DHT_TPM_table_GLM.tsv"), 
#            sep="\t", quote=F, row.names=F)

Create top tables per comparison

30 mins vs EtOH

# Top table  30mins vs EtOH
# the default method used to adjust p-values for multiple testing is BH.
tt1 <- topTags(lrt.30mins.vs.EtOH, n=nrow(counts))$table

m <- match(rownames(tt1), gtf.anno$gene.id)

# assign the rownames(tt) as the gene_id as more specific with version number at end
# as originates from original gtf
tt1$gene.id <- rownames(tt1)
tt1$gene.symbol <- gtf.anno$gene.name[m]
tt1$chr <- gtf.anno$chr[m]
tt1$start <- gtf.anno$start[m]
tt1$end <- gtf.anno$end[m]
tt1$strand <- gtf.anno$strand[m]
tt1$gene.type <- gtf.anno$gene.type[m]

m <- match(gsub("\\.[0-9]*", "", rownames(tt1)), DT$Ensembl.Gene.ID)

tt1$description <- DT$description[m]
tt1$entrez.gene.id <- DT$entrez.gene.ID[m]

# only keep chromosome names beginning with chr1..22, X, Y; remove patch chromosome assignments 
# like JH806587.1, JH806587.1 etc
tt1 <- tt1[grep("chr*", tt1$chr),]

# GOseq 30mins vs EtOH
#lengths.gene <- read.csv(paste0("/Volumes/griw/Cancer-Epigenetics/Projects/LNCaP_VCaP_DHT_Hi-C/Elyssa_2021_analysis_and_integration/RNA-Seq/tables/LNCaP_DHT_effective_length_table.csv"), row.names=1)
# get the average gene length for each row
lengths <- apply(lengths.gene, 1, mean)
bias.data <- lengths[rownames(tt1)]
names(bias.data) <- tt1$gene.symbol
bias.data <- bias.data[!duplicated(names(bias.data))]
if (length(names(bias.data[(names(bias.data) == "")])) > 0){
  bias.data <- bias.data[-which(names(bias.data)=="")]
}
bias.data <- bias.data[-which(bias.data==0)]
if (length(names(bias.data[(is.na(names(bias.data)))])) > 0){
  bias.data <- bias.data[-which(is.na(names(bias.data)))]
}
sigtt1 <- tt1[((tt1$FDR < FDR.CUTOFF)&(abs(tt1$logFC) > LOG.FC.CUTOFF)),]
comparison.UP <- sigtt1$gene.symbol[sigtt1$logFC > 0]
comparison.DOWN <- sigtt1$gene.symbol[sigtt1$logFC < 0]

comparison.UP.DE <- sapply(names(bias.data), function (x) as.numeric(x %in% comparison.UP))
comparison.DOWN.DE <- sapply(names(bias.data), function (x) as.numeric(x %in% comparison.DOWN))

#write.table(tt1, "/Volumes/griw/Cancer-Epigenetics/Projects/LNCaP_VCaP_DHT_Hi-C/Elyssa_2021_analysis_and_integration/RNA-Seq/LNCaP_DHT_30mins.vs.EtOH_DGE.tsv", sep = "\t", quote = F)

table(comparison.UP.DE)
## comparison.UP.DE
##     0     1 
## 20322   113
#comparison.UP.DE
#0     1 
#20322   113 

table(comparison.DOWN.DE)
## comparison.DOWN.DE
##     0     1 
## 20361    74
#comparison.DOWN.DE
#0     1 
#20361    74 

library(goseq)
library(magrittr)
library(dplyr)
library(ggplot2)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
## Loading required package: GenomicFeatures
## Warning: package 'GenomicFeatures' was built under R version 4.1.1
pwf1_up <- nullp(comparison.UP.DE,"hg38","geneSymbol")
## Can't find hg38/geneSymbol length data in genLenDataBase...
## Warning in grep(txdbPattern, installedPackages): argument 'pattern' has length >
## 1 and only the first element will be used
## 
## Found the annotation package, TxDb.Hsapiens.UCSC.hg38.knownGene
## Trying to get the gene lengths from it.
Resulting plots for various methods of analysing GO term enrichment results in the top tags of the DHT 30 mins vs EtOH comparison.

Resulting plots for various methods of analysing GO term enrichment results in the top tags of the DHT 30 mins vs EtOH comparison.

pwf1_down <- nullp(comparison.DOWN.DE,"hg38","geneSymbol")
## Can't find hg38/geneSymbol length data in genLenDataBase...
## Warning in grep(txdbPattern, installedPackages): argument 'pattern' has length >
## 1 and only the first element will be used
## Found the annotation package, TxDb.Hsapiens.UCSC.hg38.knownGene
## Trying to get the gene lengths from it.
## Warning in pcls(G): initial point very close to some inequality constraints
Resulting plots for various methods of analysing GO term enrichment results in the top tags of the DHT 30 mins vs EtOH comparison.

Resulting plots for various methods of analysing GO term enrichment results in the top tags of the DHT 30 mins vs EtOH comparison.

#Using the Wallenius approximation
GO.wall_UP <- goseq(pwf1_up,"hg38","geneSymbol")
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
head(GO.wall_UP)
##         category over_represented_pvalue under_represented_pvalue numDEInCat
## 13635 GO:0052697            0.0001631130                0.9999993          2
## 13633 GO:0052695            0.0006989416                0.9999925          2
## 1600  GO:0003924            0.0010997855                0.9998750          5
## 2768  GO:0006063            0.0013022608                0.9999801          2
## 6348  GO:0019585            0.0013022608                0.9999801          2
## 2334  GO:0005200            0.0015118416                0.9999236          3
##       numInCat                                   term ontology
## 13635        7             xenobiotic glucuronidation       BP
## 13633       14               cellular glucuronidation       BP
## 1600       281                        GTPase activity       MF
## 2768        19          uronic acid metabolic process       BP
## 6348        19          glucuronate metabolic process       BP
## 2334        81 structural constituent of cytoskeleton       MF
#write.table(GO.wall_UP, paste0("/Volumes/griw/Cancer-Epigenetics/Projects/LNCaP_VCaP_DHT_Hi-C/Elyssa_2021_analysis_and_integration/RNA-Seq/DGE_30mins.vs.EtOH/LNCaP_DHT_30mins.vs.EtOH.GOterms_UP.tsv"), sep="\t", quote=F, row.names=F)
##visualise Up 30mins vs EtOH
##plot TOP 10 UP
GO.wall_UP %>% 
  top_n(10, wt=-over_represented_pvalue) %>% 
  mutate(hitsPerc=numDEInCat*100/numInCat) %>% 
  ggplot(aes(x=hitsPerc, 
             y=term, 
             colour=over_represented_pvalue, 
             size=numDEInCat)) +
  geom_point() +
  expand_limits(x=0) +
  labs(x="Hits (%)", y="GO term", colour="p value", size="Count")
Resulting plots for various methods of analysing GO term enrichment results in the top tags of the DHT 30 mins vs EtOH comparison.

Resulting plots for various methods of analysing GO term enrichment results in the top tags of the DHT 30 mins vs EtOH comparison.

##BP only
GO.wall_UP_BP <- goseq(pwf1_up,"hg38","geneSymbol",test.cats=c("GO:BP"))
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
GO.wall_UP_BP %>% 
  top_n(10, wt=-over_represented_pvalue) %>% 
  mutate(hitsPerc=numDEInCat*100/numInCat) %>% 
  ggplot(aes(x=hitsPerc, 
             y=term, 
             colour=over_represented_pvalue, 
             size=numDEInCat)) +
  geom_point() +
  expand_limits(x=0) +
  labs(x="Hits (%)", y="GO term", colour="p value", size="Count", title = "30mins_DHT_GO.wall_UP_BP")
Resulting plots for various methods of analysing GO term enrichment results in the top tags of the DHT 30 mins vs EtOH comparison.

Resulting plots for various methods of analysing GO term enrichment results in the top tags of the DHT 30 mins vs EtOH comparison.

GO.wall_DOWN_BP <- goseq(pwf1_down,"hg38","geneSymbol",test.cats=c("GO:BP"))
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
GO.wall_DOWN_BP %>% 
  top_n(10, wt=-over_represented_pvalue) %>% 
  mutate(hitsPerc=numDEInCat*100/numInCat) %>% 
  ggplot(aes(x=hitsPerc, 
             y=term, 
             colour=over_represented_pvalue, 
             size=numDEInCat)) +
  geom_point() +
  expand_limits(x=0) +
  labs(x="Hits (%)", y="GO term", colour="p value", size="Count", title = "30mins_DHT_GO.wall_DOWN_BP")
Resulting plots for various methods of analysing GO term enrichment results in the top tags of the DHT 30 mins vs EtOH comparison.

Resulting plots for various methods of analysing GO term enrichment results in the top tags of the DHT 30 mins vs EtOH comparison.

GO.wall_DOWN <- goseq(pwf1_down,"hg38","geneSymbol")
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
head(GO.wall_DOWN)
##         category over_represented_pvalue under_represented_pvalue numDEInCat
## 10409 GO:0042613            0.000000e+00                        1          6
## 10407 GO:0042611            9.273757e-12                        1          6
## 15697 GO:0071556            2.784702e-11                        1          6
## 17161 GO:0098553            2.784702e-11                        1          6
## 17171 GO:0098576            1.439307e-10                        1          6
## 7189  GO:0030669            3.142357e-10                        1          6
##       numInCat
## 10409       12
## 10407       19
## 15697       22
## 17161       22
## 17171       28
## 7189        31
##                                                                       term
## 10409                                         MHC class II protein complex
## 10407                                                  MHC protein complex
## 15697 integral component of lumenal side of endoplasmic reticulum membrane
## 17161                       lumenal side of endoplasmic reticulum membrane
## 17171                                             lumenal side of membrane
## 7189                            clathrin-coated endocytic vesicle membrane
##       ontology
## 10409       CC
## 10407       CC
## 15697       CC
## 17161       CC
## 17171       CC
## 7189        CC
#write.table(GO.wall_DOWN, paste0("/Volumes/griw/Cancer-Epigenetics/Projects/LNCaP_VCaP_DHT_Hi-C/Elyssa_2021_analysis_and_integration/RNA-Seq/DGE_30mins.vs.EtOH/LNCaP_DHT_30mins.vs.EtOH.GOterms_DOWN.tsv"), sep="\t", quote=F, row.names=F)


##Visualise top 10 down
GO.wall_DOWN %>% 
  top_n(10, wt=-over_represented_pvalue) %>% 
  mutate(hitsPerc=numDEInCat*100/numInCat) %>% 
  ggplot(aes(x=hitsPerc, 
             y=term, 
             colour=over_represented_pvalue, 
             size=numDEInCat)) +
  geom_point() +
  expand_limits(x=0) +
  labs(x="Hits (%)", y="GO term", colour="p value", size="Count")
Resulting plots for various methods of analysing GO term enrichment results in the top tags of the DHT 30 mins vs EtOH comparison.

Resulting plots for various methods of analysing GO term enrichment results in the top tags of the DHT 30 mins vs EtOH comparison.

#Using random sampling
GO.sampUP <- goseq(pwf1_up,"hg38","geneSymbol",method="Sampling",repcnt=1000)
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Running the simulation...
## 0 %   
0 %   
0 %   
0 %   
0 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
100 %   
100 %   
100 %   
100 %   
100 %   
100 %   
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
#Compare both methods
plot(log10(GO.wall_UP[,2]), log10(GO.sampUP[match(GO.sampUP[,1],GO.wall_UP[,1]),2]),
     xlab="log10(Wallenius p-values)",ylab="log10(Sampling p-values)",
     xlim=c(-3,0))
abline(0,1,col=3,lty=2)
Resulting plots for various methods of analysing GO term enrichment results in the top tags of the DHT 30 mins vs EtOH comparison.

Resulting plots for various methods of analysing GO term enrichment results in the top tags of the DHT 30 mins vs EtOH comparison.

GO.sampDOWN <- goseq(pwf1_down,"hg38","geneSymbol",method="Sampling",repcnt=1000)
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Running the simulation...
## 0 %   
0 %   
0 %   
0 %   
0 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
100 %   
100 %   
100 %   
100 %   
100 %   
100 %   
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
#Compare both methods
plot(log10(GO.wall_DOWN[,2]), log10(GO.sampDOWN[match(GO.sampDOWN[,1],GO.wall_DOWN[,1]),2]),
     xlab="log10(Wallenius p-values)",ylab="log10(Sampling p-values)",
     xlim=c(-3,0))
abline(0,1,col=3,lty=2)
Resulting plots for various methods of analysing GO term enrichment results in the top tags of the DHT 30 mins vs EtOH comparison.

Resulting plots for various methods of analysing GO term enrichment results in the top tags of the DHT 30 mins vs EtOH comparison.

2hrs vs EtOH

#### Top table 2hrs vs EtOH
# the default method used to adjust p-values for multiple testing is BH.
tt2 <- topTags(lrt.2hrs.vs.EtOH, n=nrow(counts))$table

m <- match(rownames(tt2), gtf.anno$gene.id)

# assign the rownames(tt) as the gene_id as more specific with version number at end
# as originates from original gtf
tt2$gene.id <- rownames(tt2)
tt2$gene.symbol <- gtf.anno$gene.name[m]
tt2$chr <- gtf.anno$chr[m]
tt2$start <- gtf.anno$start[m]
tt2$end <- gtf.anno$end[m]
tt2$strand <- gtf.anno$strand[m]
tt2$gene.type <- gtf.anno$gene.type[m]

m <- match(gsub("\\.[0-9]*", "", rownames(tt2)), DT$Ensembl.Gene.ID)

tt2$description <- DT$description[m]
tt2$entrez.gene.id <- DT$entrez.gene.ID[m]

# only keep chromosome names beginning with chr1..22, X, Y; remove patch chromosome assignments 
# like JH806587.1, JH806587.1 etc
tt2 <- tt2[grep("chr*", tt1$chr),]

# GOseq 2hrs vs EtOH
bias.data <- lengths[rownames(tt2)]
names(bias.data) <- tt2$gene.symbol
bias.data <- bias.data[!duplicated(names(bias.data))]
if (length(names(bias.data[(names(bias.data) == "")])) > 0){
  bias.data <- bias.data[-which(names(bias.data)=="")]
}
bias.data <- bias.data[-which(bias.data==0)]
if (length(names(bias.data[(is.na(names(bias.data)))])) > 0){
  bias.data <- bias.data[-which(is.na(names(bias.data)))]
}
sigtt2 <- tt2[((tt2$FDR < FDR.CUTOFF)&(abs(tt2$logFC) > LOG.FC.CUTOFF)),]
comparison.UP <- sigtt2$gene.symbol[sigtt2$logFC > 0]
comparison.DOWN <- sigtt2$gene.symbol[sigtt2$logFC < 0]

comparison.UP.DE <- sapply(names(bias.data), function (x) as.numeric(x %in% comparison.UP))
comparison.DOWN.DE <- sapply(names(bias.data), function (x) as.numeric(x %in% comparison.DOWN))

table(comparison.UP.DE)
## comparison.UP.DE
##     0     1 
## 19721   714
#comparison.UP.DE
#0     1 
#19721   714 

table(comparison.DOWN.DE)
## comparison.DOWN.DE
##     0     1 
## 20221   214
#comparison.DOWN.DE
#0     1 
#20221   214 

dev.off()
## null device 
##           1
pwf1_up <- nullp(comparison.UP.DE,"hg38","geneSymbol")
## Can't find hg38/geneSymbol length data in genLenDataBase...
## Warning in grep(txdbPattern, installedPackages): argument 'pattern' has length >
## 1 and only the first element will be used
## Found the annotation package, TxDb.Hsapiens.UCSC.hg38.knownGene
## Trying to get the gene lengths from it.
## Warning in pcls(G): initial point very close to some inequality constraints
pwf1_down <- nullp(comparison.DOWN.DE,"hg38","geneSymbol")
## Can't find hg38/geneSymbol length data in genLenDataBase...
## Warning in grep(txdbPattern, installedPackages): argument 'pattern' has length >
## 1 and only the first element will be used
## Found the annotation package, TxDb.Hsapiens.UCSC.hg38.knownGene
## Trying to get the gene lengths from it.
#Using the Wallenius approximation
GO.wall_UP <- goseq(pwf1_up,"hg38","geneSymbol")
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
head(GO.wall_UP)
##         category over_represented_pvalue under_represented_pvalue numDEInCat
## 9048  GO:0034508            1.940401e-07                1.0000000          9
## 17156 GO:0098536            8.118984e-07                1.0000000          4
## 133   GO:0000280            1.488268e-06                0.9999996         23
## 17288 GO:0098813            3.607013e-06                0.9999992         17
## 12452 GO:0048285            9.082259e-06                0.9999972         23
## 17957 GO:0140013            1.029189e-05                0.9999982         12
##       numInCat                           term ontology
## 9048        52    centromere complex assembly       BP
## 17156        5                    deuterosome       CC
## 133        374               nuclear division       BP
## 17288      238 nuclear chromosome segregation       BP
## 12452      419              organelle fission       BP
## 17957      132       meiotic nuclear division       BP
#write.table(GO.wall_UP, paste0("/Volumes/griw/Cancer-Epigenetics/Projects/LNCaP_VCaP_DHT_Hi-C/Elyssa_2021_analysis_and_integration/RNA-Seq/DGE_2hrs.vs.EtOH/LNCaP_DHT_2hrs.vs.EtOH.GOterms_UP.tsv"), sep="\t", quote=F, row.names=F)

##visualise Up 2hrs vs EtOH
##plot TOP 10 UP
GO.wall_UP %>% 
  top_n(10, wt=-over_represented_pvalue) %>% 
  mutate(hitsPerc=numDEInCat*100/numInCat) %>% 
  ggplot(aes(x=hitsPerc, 
             y=term, 
             colour=over_represented_pvalue, 
             size=numDEInCat)) +
  geom_point() +
  expand_limits(x=0) +
  labs(x="Hits (%)", y="GO term", colour="p value", size="Count")

##BP only
GO.wall_UP_BP <- goseq(pwf1_up,"hg38","geneSymbol",test.cats=c("GO:BP"))
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
GO.wall_UP_BP %>% 
  top_n(10, wt=-over_represented_pvalue) %>% 
  mutate(hitsPerc=numDEInCat*100/numInCat) %>% 
  ggplot(aes(x=hitsPerc, 
             y=term, 
             colour=over_represented_pvalue, 
             size=numDEInCat)) +
  geom_point() +
  expand_limits(x=0) +
  labs(x="Hits (%)", y="GO term", colour="p value", size="Count", title = "2hrs_DHT_GO.wall_UP_BP")

GO.wall_DOWN_BP <- goseq(pwf1_down,"hg38","geneSymbol",test.cats=c("GO:BP"))
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
GO.wall_DOWN_BP %>% 
  top_n(10, wt=-over_represented_pvalue) %>% 
  mutate(hitsPerc=numDEInCat*100/numInCat) %>% 
  ggplot(aes(x=hitsPerc, 
             y=term, 
             colour=over_represented_pvalue, 
             size=numDEInCat)) +
  geom_point() +
  expand_limits(x=0) +
  labs(x="Hits (%)", y="GO term", colour="p value", size="Count", title = "2hrs_DHT_GO.wall_DOWN_BP")



GO.wall_DOWN <- goseq(pwf1_down,"hg38","geneSymbol")
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
head(GO.wall_DOWN)
##         category over_represented_pvalue under_represented_pvalue numDEInCat
## 10409 GO:0042613            3.382198e-12                        1          7
## 10407 GO:0042611            4.294504e-10                        1          7
## 8053  GO:0032395            2.076583e-09                        1          5
## 15697 GO:0071556            6.443080e-08                        1          6
## 17161 GO:0098553            6.443080e-08                        1          6
## 7433  GO:0031226            9.903315e-08                        1         30
##       numInCat
## 10409       12
## 10407       19
## 8053         7
## 15697       22
## 17161       22
## 7433      1047
##                                                                       term
## 10409                                         MHC class II protein complex
## 10407                                                  MHC protein complex
## 8053                                        MHC class II receptor activity
## 15697 integral component of lumenal side of endoplasmic reticulum membrane
## 17161                       lumenal side of endoplasmic reticulum membrane
## 7433                                intrinsic component of plasma membrane
##       ontology
## 10409       CC
## 10407       CC
## 8053        MF
## 15697       CC
## 17161       CC
## 7433        CC
#write.table(GO.wall_DOWN, paste0("/Volumes/griw/Cancer-Epigenetics/Projects/LNCaP_VCaP_DHT_Hi-C/Elyssa_2021_analysis_and_integration/RNA-Seq/DGE_2hrs.vs.EtOH/LNCaP_DHT_2hrs.vs.EtOH.GOterms_DOWN.tsv"), sep="\t", quote=F, row.names=F)


##Visualise top 10 down
GO.wall_DOWN %>% 
  top_n(10, wt=-over_represented_pvalue) %>% 
  mutate(hitsPerc=numDEInCat*100/numInCat) %>% 
  ggplot(aes(x=hitsPerc, 
             y=term, 
             colour=over_represented_pvalue, 
             size=numDEInCat)) +
  geom_point() +
  expand_limits(x=0) +
  labs(x="Hits (%)", y="GO term", colour="p value", size="Count")



#Using the Wallenius approximation
GO.wall_UP <- goseq(pwf1_up,"hg38","geneSymbol")
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
head(GO.wall_UP)
##         category over_represented_pvalue under_represented_pvalue numDEInCat
## 9048  GO:0034508            1.940401e-07                1.0000000          9
## 17156 GO:0098536            8.118984e-07                1.0000000          4
## 133   GO:0000280            1.488268e-06                0.9999996         23
## 17288 GO:0098813            3.607013e-06                0.9999992         17
## 12452 GO:0048285            9.082259e-06                0.9999972         23
## 17957 GO:0140013            1.029189e-05                0.9999982         12
##       numInCat                           term ontology
## 9048        52    centromere complex assembly       BP
## 17156        5                    deuterosome       CC
## 133        374               nuclear division       BP
## 17288      238 nuclear chromosome segregation       BP
## 12452      419              organelle fission       BP
## 17957      132       meiotic nuclear division       BP
#write.table(GO.wall_UP, paste0("/Volumes/griw/Cancer-Epigenetics/Projects/LNCaP_VCaP_DHT_Hi-C/Elyssa_2021_analysis_and_integration/RNA-Seq/DGE_2hrs.vs.EtOH/LNCaP_DHT_2hrs.vs.EtOH.GOterms_UP.tsv"), sep="\t", quote=F, row.names=F)

GO.wall_DOWN <- goseq(pwf1_down,"hg38","geneSymbol")
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
head(GO.wall_DOWN)
##         category over_represented_pvalue under_represented_pvalue numDEInCat
## 10409 GO:0042613            3.382198e-12                        1          7
## 10407 GO:0042611            4.294504e-10                        1          7
## 8053  GO:0032395            2.076583e-09                        1          5
## 15697 GO:0071556            6.443080e-08                        1          6
## 17161 GO:0098553            6.443080e-08                        1          6
## 7433  GO:0031226            9.903315e-08                        1         30
##       numInCat
## 10409       12
## 10407       19
## 8053         7
## 15697       22
## 17161       22
## 7433      1047
##                                                                       term
## 10409                                         MHC class II protein complex
## 10407                                                  MHC protein complex
## 8053                                        MHC class II receptor activity
## 15697 integral component of lumenal side of endoplasmic reticulum membrane
## 17161                       lumenal side of endoplasmic reticulum membrane
## 7433                                intrinsic component of plasma membrane
##       ontology
## 10409       CC
## 10407       CC
## 8053        MF
## 15697       CC
## 17161       CC
## 7433        CC
#write.table(GO.wall_DOWN, paste0("/Volumes/griw/Cancer-Epigenetics/Projects/LNCaP_VCaP_DHT_Hi-C/Elyssa_2021_analysis_and_integration/RNA-Seq/DGE_2hrs.vs.EtOH/LNCaP_DHT_2hrs.vs.EtOH.GOterms_DOWN.tsv"), sep="\t", quote=F, row.names=F)

#Using random sampling
GO.sampUP <- goseq(pwf1_up,"hg38","geneSymbol",method="Sampling",repcnt=1000)
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Running the simulation...
## 0 %   
0 %   
0 %   
0 %   
0 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
100 %   
100 %   
100 %   
100 %   
100 %   
100 %   
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
#Compare both methods
plot(log10(GO.wall_UP[,2]), log10(GO.sampUP[match(GO.sampUP[,1],GO.wall_UP[,1]),2]),
     xlab="log10(Wallenius p-values)",ylab="log10(Sampling p-values)",
     xlim=c(-3,0))
abline(0,1,col=3,lty=2)

GO.sampDOWN <- goseq(pwf1_down,"hg38","geneSymbol",method="Sampling",repcnt=1000)
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Running the simulation...
## 0 %   
0 %   
0 %   
0 %   
0 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
100 %   
100 %   
100 %   
100 %   
100 %   
100 %   
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
#Compare both methods
plot(log10(GO.wall_DOWN[,2]), log10(GO.sampDOWN[match(GO.sampDOWN[,1],GO.wall_DOWN[,1]),2]),
     xlab="log10(Wallenius p-values)",ylab="log10(Sampling p-values)",
     xlim=c(-3,0))
abline(0,1,col=3,lty=2)

4hrs vs EtOH

#### Top table 4hrs vs EtOH
# the default method used to adjust p-values for multiple testing is BH.
tt3 <- topTags(lrt.4hrs.vs.EtOH, n=nrow(counts))$table

m <- match(rownames(tt3), gtf.anno$gene.id)

# assign the rownames(tt) as the gene_id as more specific with version number at end
# as originates from original gtf
tt3$gene.id <- rownames(tt3)
tt3$gene.symbol <- gtf.anno$gene.name[m]
tt3$chr <- gtf.anno$chr[m]
tt3$start <- gtf.anno$start[m]
tt3$end <- gtf.anno$end[m]
tt3$strand <- gtf.anno$strand[m]
tt3$gene.type <- gtf.anno$gene.type[m]

m <- match(gsub("\\.[0-9]*", "", rownames(tt3)), DT$Ensembl.Gene.ID)

tt3$description <- DT$description[m]
tt3$entrez.gene.id <- DT$entrez.gene.ID[m]

# only keep chromosome names beginning with chr1..22, X, Y; remove patch chromosome assignments 
# like JH806587.1, JH806587.1 etc
tt3 <- tt3[grep("chr*", tt1$chr),]

# GOseq 4hrs vs EtOH
bias.data <- lengths[rownames(tt3)]
names(bias.data) <- tt3$gene.symbol
bias.data <- bias.data[!duplicated(names(bias.data))]
if (length(names(bias.data[(names(bias.data) == "")])) > 0){
  bias.data <- bias.data[-which(names(bias.data)=="")]
}
bias.data <- bias.data[-which(bias.data==0)]
if (length(names(bias.data[(is.na(names(bias.data)))])) > 0){
  bias.data <- bias.data[-which(is.na(names(bias.data)))]
}
sigtt3 <- tt3[((tt3$FDR < FDR.CUTOFF)&(abs(tt3$logFC) > LOG.FC.CUTOFF)),]
comparison.UP <- sigtt3$gene.symbol[sigtt3$logFC > 0]
comparison.DOWN <- sigtt3$gene.symbol[sigtt3$logFC < 0]

comparison.UP.DE <- sapply(names(bias.data), function (x) as.numeric(x %in% comparison.UP))
comparison.DOWN.DE <- sapply(names(bias.data), function (x) as.numeric(x %in% comparison.DOWN))

table(comparison.UP.DE)
## comparison.UP.DE
##     0     1 
## 19935   500
#comparison.UP.DE
#0     1 
#19935   500

table(comparison.DOWN.DE)
## comparison.DOWN.DE
##     0     1 
## 20161   274
#comparison.DOWN.DE
#0     1 
#20161   274 

dev.off()
## null device 
##           1
pwf1_up <- nullp(comparison.UP.DE,"hg38","geneSymbol")
## Can't find hg38/geneSymbol length data in genLenDataBase...
## Warning in grep(txdbPattern, installedPackages): argument 'pattern' has length >
## 1 and only the first element will be used
## Found the annotation package, TxDb.Hsapiens.UCSC.hg38.knownGene
## Trying to get the gene lengths from it.
## Warning in pcls(G): initial point very close to some inequality constraints
pwf1_down <- nullp(comparison.DOWN.DE,"hg38","geneSymbol")
## Can't find hg38/geneSymbol length data in genLenDataBase...
## Warning in grep(txdbPattern, installedPackages): argument 'pattern' has length >
## 1 and only the first element will be used
## Found the annotation package, TxDb.Hsapiens.UCSC.hg38.knownGene
## Trying to get the gene lengths from it.
## Warning in pcls(G): initial point very close to some inequality constraints
#Using the Wallenius approximation
GO.wall_UP <- goseq(pwf1_up,"hg38","geneSymbol")
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
head(GO.wall_UP)
##         category over_represented_pvalue under_represented_pvalue numDEInCat
## 15921 GO:0071944            2.599407e-07                1.0000000        108
## 17156 GO:0098536            4.468379e-07                1.0000000          4
## 2662  GO:0005886            2.316818e-06                0.9999992         99
## 6834  GO:0023052            1.759305e-05                0.9999905        115
## 3462  GO:0007165            1.903253e-05                0.9999897        107
## 12961 GO:0050896            2.726977e-05                0.9999849        146
##       numInCat                 term ontology
## 15921     3906       cell periphery       CC
## 17156        5          deuterosome       CC
## 2662      3599      plasma membrane       CC
## 6834      4632            signaling       BP
## 3462      4264  signal transduction       BP
## 12961     6441 response to stimulus       BP
#write.table(GO.wall_UP, paste0("/Volumes/griw/Cancer-Epigenetics/Projects/LNCaP_VCaP_DHT_Hi-C/Elyssa_2021_analysis_and_integration/RNA-Seq/DGE_4hrs.vs.EtOH/LNCaP_DHT_4hrs.vs.EtOH.GOterms_UP.tsv"), sep="\t", quote=F, row.names=F)

##visualise Up 4hrs vs EtOH
##plot TOP 10 UP
GO.wall_UP %>% 
  top_n(10, wt=-over_represented_pvalue) %>% 
  mutate(hitsPerc=numDEInCat*100/numInCat) %>% 
  ggplot(aes(x=hitsPerc, 
             y=term, 
             colour=over_represented_pvalue, 
             size=numDEInCat)) +
  geom_point() +
  expand_limits(x=0) +
  labs(x="Hits (%)", y="GO term", colour="p value", size="Count")

##BP only
GO.wall_UP_BP <- goseq(pwf1_up,"hg38","geneSymbol",test.cats=c("GO:BP"))
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
GO.wall_UP_BP %>% 
  top_n(10, wt=-over_represented_pvalue) %>% 
  mutate(hitsPerc=numDEInCat*100/numInCat) %>% 
  ggplot(aes(x=hitsPerc, 
             y=term, 
             colour=over_represented_pvalue, 
             size=numDEInCat)) +
  geom_point() +
  expand_limits(x=0) +
  labs(x="Hits (%)", y="GO term", colour="p value", size="Count", title = "4hrs_DHT_GO.wall_UP_BP")

GO.wall_DOWN_BP <- goseq(pwf1_down,"hg38","geneSymbol",test.cats=c("GO:BP"))
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
GO.wall_DOWN_BP %>% 
  top_n(10, wt=-over_represented_pvalue) %>% 
  mutate(hitsPerc=numDEInCat*100/numInCat) %>% 
  ggplot(aes(x=hitsPerc, 
             y=term, 
             colour=over_represented_pvalue, 
             size=numDEInCat)) +
  geom_point() +
  expand_limits(x=0) +
  labs(x="Hits (%)", y="GO term", colour="p value", size="Count", title = "4hrs_DHT_GO.wall_DOWN_BP")


GO.wall_DOWN <- goseq(pwf1_down,"hg38","geneSymbol")
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
head(GO.wall_DOWN)
##         category over_represented_pvalue under_represented_pvalue numDEInCat
## 10409 GO:0042613            1.508778e-13                        1          8
## 15921 GO:0071944            2.430092e-12                        1         95
## 2663  GO:0005887            1.550672e-11                        1         42
## 7433  GO:0031226            2.202798e-11                        1         43
## 10407 GO:0042611            2.251078e-11                        1          8
## 2662  GO:0005886            2.681306e-11                        1         88
##       numInCat                                   term ontology
## 10409       12           MHC class II protein complex       CC
## 15921     3906                         cell periphery       CC
## 2663       991  integral component of plasma membrane       CC
## 7433      1047 intrinsic component of plasma membrane       CC
## 10407       19                    MHC protein complex       CC
## 2662      3599                        plasma membrane       CC
#write.table(GO.wall_DOWN, paste0("/Volumes/griw/Cancer-Epigenetics/Projects/LNCaP_VCaP_DHT_Hi-C/Elyssa_2021_analysis_and_integration/RNA-Seq/DGE_4hrs.vs.EtOH/LNCaP_DHT_4hrs.vs.EtOH.GOterms_DOWN.tsv"), sep="\t", quote=F, row.names=F)


##Visualise top 10 down
GO.wall_DOWN %>% 
  top_n(10, wt=-over_represented_pvalue) %>% 
  mutate(hitsPerc=numDEInCat*100/numInCat) %>% 
  ggplot(aes(x=hitsPerc, 
             y=term, 
             colour=over_represented_pvalue, 
             size=numDEInCat)) +
  geom_point() +
  expand_limits(x=0) +
  labs(x="Hits (%)", y="GO term", colour="p value", size="Count")

#Using the Wallenius approximation
GO.wall_UP <- goseq(pwf1_up,"hg38","geneSymbol")
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
head(GO.wall_UP)
##         category over_represented_pvalue under_represented_pvalue numDEInCat
## 15921 GO:0071944            2.599407e-07                1.0000000        108
## 17156 GO:0098536            4.468379e-07                1.0000000          4
## 2662  GO:0005886            2.316818e-06                0.9999992         99
## 6834  GO:0023052            1.759305e-05                0.9999905        115
## 3462  GO:0007165            1.903253e-05                0.9999897        107
## 12961 GO:0050896            2.726977e-05                0.9999849        146
##       numInCat                 term ontology
## 15921     3906       cell periphery       CC
## 17156        5          deuterosome       CC
## 2662      3599      plasma membrane       CC
## 6834      4632            signaling       BP
## 3462      4264  signal transduction       BP
## 12961     6441 response to stimulus       BP
#write.table(GO.wall_UP, paste0("/Volumes/griw/Cancer-Epigenetics/Projects/LNCaP_VCaP_DHT_Hi-C/Elyssa_2021_analysis_and_integration/RNA-Seq/DGE_4hrs.vs.EtOH/LNCaP_DHT_4hrs.vs.EtOH.GOterms_UP.tsv"), sep="\t", quote=F, row.names=F)

GO.wall_DOWN <- goseq(pwf1_down,"hg38","geneSymbol")
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
head(GO.wall_DOWN)
##         category over_represented_pvalue under_represented_pvalue numDEInCat
## 10409 GO:0042613            1.508778e-13                        1          8
## 15921 GO:0071944            2.430092e-12                        1         95
## 2663  GO:0005887            1.550672e-11                        1         42
## 7433  GO:0031226            2.202798e-11                        1         43
## 10407 GO:0042611            2.251078e-11                        1          8
## 2662  GO:0005886            2.681306e-11                        1         88
##       numInCat                                   term ontology
## 10409       12           MHC class II protein complex       CC
## 15921     3906                         cell periphery       CC
## 2663       991  integral component of plasma membrane       CC
## 7433      1047 intrinsic component of plasma membrane       CC
## 10407       19                    MHC protein complex       CC
## 2662      3599                        plasma membrane       CC
#write.table(GO.wall_DOWN, paste0("/Volumes/griw/Cancer-Epigenetics/Projects/LNCaP_VCaP_DHT_Hi-C/Elyssa_2021_analysis_and_integration/RNA-Seq/DGE_4hrs.vs.EtOH/LNCaP_DHT_4hrs.vs.EtOH.GOterms_DOWN.tsv"), sep="\t", quote=F, row.names=F)

#Using random sampling
GO.sampUP <- goseq(pwf1_up,"hg38","geneSymbol",method="Sampling",repcnt=1000)
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Running the simulation...
## 0 %   
0 %   
0 %   
0 %   
0 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
100 %   
100 %   
100 %   
100 %   
100 %   
100 %   
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
#Compare both methods
plot(log10(GO.wall_UP[,2]), log10(GO.sampUP[match(GO.sampUP[,1],GO.wall_UP[,1]),2]),
     xlab="log10(Wallenius p-values)",ylab="log10(Sampling p-values)",
     xlim=c(-3,0))
abline(0,1,col=3,lty=2)

GO.sampDOWN <- goseq(pwf1_down,"hg38","geneSymbol",method="Sampling",repcnt=1000)
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Running the simulation...
## 0 %   
0 %   
0 %   
0 %   
0 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
100 %   
100 %   
100 %   
100 %   
100 %   
100 %   
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
#Compare both methods
plot(log10(GO.wall_DOWN[,2]), log10(GO.sampDOWN[match(GO.sampDOWN[,1],GO.wall_DOWN[,1]),2]),
     xlab="log10(Wallenius p-values)",ylab="log10(Sampling p-values)",
     xlim=c(-3,0))
abline(0,1,col=3,lty=2)

16hrs vs EtOH

#### Top table 16hrs vs EtOH
# the default method used to adjust p-values for multiple testing is BH.
tt4 <- topTags(lrt.16hrs.vs.EtOH, n=nrow(counts))$table

m <- match(rownames(tt4), gtf.anno$gene.id)

# assign the rownames(tt) as the gene_id as more specific with version number at end
# as originates from original gtf
tt4$gene.id <- rownames(tt4)
tt4$gene.symbol <- gtf.anno$gene.name[m]
tt4$chr <- gtf.anno$chr[m]
tt4$start <- gtf.anno$start[m]
tt4$end <- gtf.anno$end[m]
tt4$strand <- gtf.anno$strand[m]
tt4$gene.type <- gtf.anno$gene.type[m]

m <- match(gsub("\\.[0-9]*", "", rownames(tt4)), DT$Ensembl.Gene.ID)

tt4$description <- DT$description[m]
tt4$entrez.gene.id <- DT$entrez.gene.ID[m]

# only keep chromosome names beginning with chr1..22, X, Y; remove patch chromosome assignments 
# like JH806587.1, JH806587.1 etc
tt4 <- tt4[grep("chr*", tt1$chr),]

# GOseq 16hrs vs EtOH
bias.data <- lengths[rownames(tt4)]
names(bias.data) <- tt4$gene.symbol
bias.data <- bias.data[!duplicated(names(bias.data))]
if (length(names(bias.data[(names(bias.data) == "")])) > 0){
  bias.data <- bias.data[-which(names(bias.data)=="")]
}
bias.data <- bias.data[-which(bias.data==0)]
if (length(names(bias.data[(is.na(names(bias.data)))])) > 0){
  bias.data <- bias.data[-which(is.na(names(bias.data)))]
}
sigtt4 <- tt4[((tt4$FDR < FDR.CUTOFF)&(abs(tt4$logFC) > LOG.FC.CUTOFF)),]
comparison.UP <- sigtt4$gene.symbol[sigtt4$logFC > 0]
comparison.DOWN <- sigtt4$gene.symbol[sigtt4$logFC < 0]

comparison.UP.DE <- sapply(names(bias.data), function (x) as.numeric(x %in% comparison.UP))
comparison.DOWN.DE <- sapply(names(bias.data), function (x) as.numeric(x %in% comparison.DOWN))

table(comparison.UP.DE)
## comparison.UP.DE
##     0     1 
## 19838   597
#comparison.UP.DE
#0     1 
#19838   597 

table(comparison.DOWN.DE)
## comparison.DOWN.DE
##     0     1 
## 20102   333
#comparison.DOWN.DE
#0     1 
#20102   333

dev.off()
## null device 
##           1
pwf1_up <- nullp(comparison.UP.DE,"hg38","geneSymbol")
## Can't find hg38/geneSymbol length data in genLenDataBase...
## Warning in grep(txdbPattern, installedPackages): argument 'pattern' has length >
## 1 and only the first element will be used
## Found the annotation package, TxDb.Hsapiens.UCSC.hg38.knownGene
## Trying to get the gene lengths from it.
## Warning in pcls(G): initial point very close to some inequality constraints
pwf1_down <- nullp(comparison.DOWN.DE,"hg38","geneSymbol")
## Can't find hg38/geneSymbol length data in genLenDataBase...
## Warning in grep(txdbPattern, installedPackages): argument 'pattern' has length >
## 1 and only the first element will be used
## Found the annotation package, TxDb.Hsapiens.UCSC.hg38.knownGene
## Trying to get the gene lengths from it.
## Warning in pcls(G): initial point very close to some inequality constraints
#Using the Wallenius approximation
GO.wall_UP <- goseq(pwf1_up,"hg38","geneSymbol")
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
head(GO.wall_UP)
##         category over_represented_pvalue under_represented_pvalue numDEInCat
## 15921 GO:0071944            1.177481e-11                        1        145
## 2662  GO:0005886            1.135104e-10                        1        134
## 2479  GO:0005576            3.053634e-10                        1        108
## 1284  GO:0003008            7.493742e-10                        1         63
## 8142  GO:0032501            3.531181e-08                        1        166
## 10210 GO:0042221            4.014972e-08                        1        113
##       numInCat                             term ontology
## 15921     3906                   cell periphery       CC
## 2662      3599                  plasma membrane       CC
## 2479      2793             extracellular region       CC
## 1284      1234                   system process       BP
## 8142      5252 multicellular organismal process       BP
## 10210     3192             response to chemical       BP
#write.table(GO.wall_UP, paste0("/Volumes/griw/Cancer-Epigenetics/Projects/LNCaP_VCaP_DHT_Hi-C/Elyssa_2021_analysis_and_integration/RNA-Seq/DGE_16hrs.vs.EtOH/LNCaP_DHT_16hrs.vs.EtOH.GOterms_UP.tsv"), sep="\t", quote=F, row.names=F)

##visualise Up 16hrs vs EtOH
##plot TOP 10 UP
GO.wall_UP %>% 
  top_n(10, wt=-over_represented_pvalue) %>% 
  mutate(hitsPerc=numDEInCat*100/numInCat) %>% 
  ggplot(aes(x=hitsPerc, 
             y=term, 
             colour=over_represented_pvalue, 
             size=numDEInCat)) +
  geom_point() +
  expand_limits(x=0) +
  labs(x="Hits (%)", y="GO term", colour="p value", size="Count")

##BP only
GO.wall_UP_BP <- goseq(pwf1_up,"hg38","geneSymbol",test.cats=c("GO:BP"))
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
GO.wall_UP_BP %>% 
  top_n(10, wt=-over_represented_pvalue) %>% 
  mutate(hitsPerc=numDEInCat*100/numInCat) %>% 
  ggplot(aes(x=hitsPerc, 
             y=term, 
             colour=over_represented_pvalue, 
             size=numDEInCat)) +
  geom_point() +
  expand_limits(x=0) +
  labs(x="Hits (%)", y="GO term", colour="p value", size="Count", title = "16hrs_DHT_GO.wall_UP_BP")

GO.wall_DOWN_BP <- goseq(pwf1_down,"hg38","geneSymbol",test.cats=c("GO:BP"))
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
GO.wall_DOWN_BP %>% 
  top_n(10, wt=-over_represented_pvalue) %>% 
  mutate(hitsPerc=numDEInCat*100/numInCat) %>% 
  ggplot(aes(x=hitsPerc, 
             y=term, 
             colour=over_represented_pvalue, 
             size=numDEInCat)) +
  geom_point() +
  expand_limits(x=0) +
  labs(x="Hits (%)", y="GO term", colour="p value", size="Count", title = "16hrs_DHT_GO.wall_DOWN_BP")

GO.wall_DOWN <- goseq(pwf1_down,"hg38","geneSymbol")
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
head(GO.wall_DOWN)
##         category over_represented_pvalue under_represented_pvalue numDEInCat
## 15921 GO:0071944            1.028918e-15                        1        124
## 8142  GO:0032501            1.516505e-15                        1        148
## 2662  GO:0005886            6.055086e-14                        1        114
## 7431  GO:0031224            5.975453e-13                        1        109
## 5449  GO:0016021            4.385147e-12                        1        105
## 12642 GO:0048731            3.775266e-11                        1        106
##       numInCat                             term ontology
## 15921     3906                   cell periphery       CC
## 8142      5252 multicellular organismal process       BP
## 2662      3599                  plasma membrane       CC
## 7431      3495  intrinsic component of membrane       CC
## 5449      3397   integral component of membrane       CC
## 12642     3563               system development       BP
#write.table(GO.wall_DOWN, paste0("/Volumes/griw/Cancer-Epigenetics/Projects/LNCaP_VCaP_DHT_Hi-C/Elyssa_2021_analysis_and_integration/RNA-Seq/DGE_16hrs.vs.EtOH/LNCaP_DHT_16hrs.vs.EtOH.GOterms_DOWN.tsv"), sep="\t", quote=F, row.names=F)

##Visualise top 10 down
GO.wall_DOWN %>% 
  top_n(10, wt=-over_represented_pvalue) %>% 
  mutate(hitsPerc=numDEInCat*100/numInCat) %>% 
  ggplot(aes(x=hitsPerc, 
             y=term, 
             colour=over_represented_pvalue, 
             size=numDEInCat)) +
  geom_point() +
  expand_limits(x=0) +
  labs(x="Hits (%)", y="GO term", colour="p value", size="Count")



#Using the Wallenius approximation
GO.wall_UP <- goseq(pwf1_up,"hg38","geneSymbol")
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
head(GO.wall_UP)
##         category over_represented_pvalue under_represented_pvalue numDEInCat
## 15921 GO:0071944            1.177481e-11                        1        145
## 2662  GO:0005886            1.135104e-10                        1        134
## 2479  GO:0005576            3.053634e-10                        1        108
## 1284  GO:0003008            7.493742e-10                        1         63
## 8142  GO:0032501            3.531181e-08                        1        166
## 10210 GO:0042221            4.014972e-08                        1        113
##       numInCat                             term ontology
## 15921     3906                   cell periphery       CC
## 2662      3599                  plasma membrane       CC
## 2479      2793             extracellular region       CC
## 1284      1234                   system process       BP
## 8142      5252 multicellular organismal process       BP
## 10210     3192             response to chemical       BP
#write.table(GO.wall_UP, paste0("/Volumes/griw/Cancer-Epigenetics/Projects/LNCaP_VCaP_DHT_Hi-C/Elyssa_2021_analysis_and_integration/RNA-Seq/DGE_16hrs.vs.EtOH/LNCaP_DHT_16hrs.vs.EtOH.GOterms_UP.tsv"), sep="\t", quote=F, row.names=F)

GO.wall_DOWN <- goseq(pwf1_down,"hg38","geneSymbol")
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
head(GO.wall_DOWN)
##         category over_represented_pvalue under_represented_pvalue numDEInCat
## 15921 GO:0071944            1.028918e-15                        1        124
## 8142  GO:0032501            1.516505e-15                        1        148
## 2662  GO:0005886            6.055086e-14                        1        114
## 7431  GO:0031224            5.975453e-13                        1        109
## 5449  GO:0016021            4.385147e-12                        1        105
## 12642 GO:0048731            3.775266e-11                        1        106
##       numInCat                             term ontology
## 15921     3906                   cell periphery       CC
## 8142      5252 multicellular organismal process       BP
## 2662      3599                  plasma membrane       CC
## 7431      3495  intrinsic component of membrane       CC
## 5449      3397   integral component of membrane       CC
## 12642     3563               system development       BP
#write.table(GO.wall_DOWN, paste0("/Volumes/griw/Cancer-Epigenetics/Projects/LNCaP_VCaP_DHT_Hi-C/Elyssa_2021_analysis_and_integration/RNA-Seq/DGE_16hrs.vs.EtOH/LNCaP_DHT_16hrs.vs.EtOH.GOterms_DOWN.tsv"), sep="\t", quote=F, row.names=F)

#Using random sampling
GO.sampUP <- goseq(pwf1_up,"hg38","geneSymbol",method="Sampling",repcnt=1000)
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Running the simulation...
## 0 %   
0 %   
0 %   
0 %   
0 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
100 %   
100 %   
100 %   
100 %   
100 %   
100 %   
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
#Compare both methods
plot(log10(GO.wall_UP[,2]), log10(GO.sampUP[match(GO.sampUP[,1],GO.wall_UP[,1]),2]),
     xlab="log10(Wallenius p-values)",ylab="log10(Sampling p-values)",
     xlim=c(-3,0))
abline(0,1,col=3,lty=2)

GO.sampDOWN <- goseq(pwf1_down,"hg38","geneSymbol",method="Sampling",repcnt=1000)
## Fetching GO annotations...
## For 6243 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Running the simulation...
## 0 %   
0 %   
0 %   
0 %   
0 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
1 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
2 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
3 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
4 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
5 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
6 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
7 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
8 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
9 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
10 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
11 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
12 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
13 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
14 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
15 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
16 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
17 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
18 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
19 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
20 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
21 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
22 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
23 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
24 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
25 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
26 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
27 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
28 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
29 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
30 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
31 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
32 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
33 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
34 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
35 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
36 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
37 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
38 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
39 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
40 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
41 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
42 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
43 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
44 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
45 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
46 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
47 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
48 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
49 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
50 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
51 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
52 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
53 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
54 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
55 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
56 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
57 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
58 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
59 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
60 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
61 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
62 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
63 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
64 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
65 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
66 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
67 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
68 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
69 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
70 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
71 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
72 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
73 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
74 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
75 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
76 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
77 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
78 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
79 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
80 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
81 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
82 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
83 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
84 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
85 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
86 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
87 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
88 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
89 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
90 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
91 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
92 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
93 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
94 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
95 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
96 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
97 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
98 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
99 %   
100 %   
100 %   
100 %   
100 %   
100 %   
100 %   
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
#Compare both methods
plot(log10(GO.wall_DOWN[,2]), log10(GO.sampDOWN[match(GO.sampDOWN[,1],GO.wall_DOWN[,1]),2]),
     xlab="log10(Wallenius p-values)",ylab="log10(Sampling p-values)",
     xlim=c(-3,0))
abline(0,1,col=3,lty=2)


Output GTF annotation

## All genes from exp per cell line output
col.order <- c(3,4,5,6,1,2,7)
#column.order <- c(1:15, 1:5)
gtf.anno2 <- gtf.anno[col.order]
write.table(gtf.anno2, paste0("GTF_anno.bed"), sep="\t", quote=F, row.names=F)

Save RData

#save.image("LNCaP_DHT_edgeR_31082021.RData"